library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom        1.0.1     ✔ rsample      1.1.0
## ✔ dials        1.0.0     ✔ tune         1.0.0
## ✔ infer        1.0.3     ✔ workflows    1.1.0
## ✔ modeldata    1.0.1     ✔ workflowsets 1.0.0
## ✔ parsnip      1.0.2     ✔ yardstick    1.1.0
## ✔ recipes      1.0.1     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
library(lubridate)
## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggcorrplot)
library(Factoshiny)
## Loading required package: FactoMineR
## Loading required package: shiny
## 
## Attaching package: 'shiny'
## 
## The following object is masked from 'package:infer':
## 
##     observe
## 
## Loading required package: FactoInvestigate
library(FactoMineR)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## 
## Attaching package: 'arules'
## 
## The following object is masked from 'package:recipes':
## 
##     discretize
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library(RColorBrewer)
library(viridis)
## Loading required package: viridisLite
## 
## Attaching package: 'viridis'
## 
## The following object is masked from 'package:scales':
## 
##     viridis_pal
library(ggrepel)
options(ggrepel.max.overlaps = Inf)

Read data

df <- read_csv("~/Documentos/GitHub/MVA-Project/df_preprocessed.csv")
## Rows: 367645 Columns: 42
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): Name, Sex, Event, Equipment, Place, Tested, Country, Federation, ...
## dbl  (31): Age, BodyweightKg, Squat1Kg, Squat2Kg, Squat3Kg, Bench1Kg, Bench2...
## date  (1): Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
numeric_cols <- df %>%
  select_if(is.numeric)

cols_zero_var <- df %>% 
  select(colnames(numeric_cols)[1:13])
round(cor(cols_zero_var, use = "complete.obs"), 1) %>%
  ggcorrplot()

Remember that these correlations are using the negative values of the lift. Let’s change that

round(cor(cols_zero_var %>% mutate_if(is.numeric, abs), use = "complete.obs"), 1) %>%
  ggcorrplot(method = "circle", type = "lower")

Makes sense

pca_rec <- recipe(~., data = cols_zero_var %>% mutate_if(is.numeric, abs)) %>%
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors()) 

pca_prep <- prep(pca_rec)

tidied_pca <- tidy(pca_prep, 2)

tidied_pca %>%
  filter(component %in% paste0("PC", 1:9)) %>%
  mutate(component = fct_inorder(component)) %>%
  ggplot(aes(value, terms, fill = terms)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~component, nrow = 1) +
  labs(y = NULL)

summary(pca_prep$steps[[2]]$res)
## Importance of components:
##                          PC1     PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     3.221 1.03064 0.87961 0.59273 0.46153 0.43331 0.12534
## Proportion of Variance 0.798 0.08171 0.05952 0.02702 0.01639 0.01444 0.00121
## Cumulative Proportion  0.798 0.87971 0.93922 0.96625 0.98263 0.99708 0.99829
##                            PC8     PC9    PC10    PC11    PC12    PC13
## Standard deviation     0.07941 0.07397 0.06822 0.04907 0.04226 0.04081
## Proportion of Variance 0.00049 0.00042 0.00036 0.00019 0.00014 0.00013
## Cumulative Proportion  0.99877 0.99919 0.99955 0.99973 0.99987 1.00000
juice(pca_prep) %>%
  ggplot(aes(PC1, PC2)) +
  geom_point(alpha = 0.7, size = 2) +
  labs(color = NULL)

pca_cumm <- as_data_frame(summary(pca_prep$steps[[2]]$res)$importance)[3, ] %>%
  gather()
## Warning: `as_data_frame()` was deprecated in tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
pca_cumm %>%
  mutate(key = fct_inorder(key)) %>%
  filter(key %in% paste0("PC", 1:5)) %>%
  ggplot(aes(x = key, y = value)) + geom_col(fill = "#91E5F8") 

With factoextra

res.pca <- PCA(cols_zero_var %>% mutate_if(is.numeric, abs),  graph = FALSE)
eig <- get_eig(res.pca)
eig
##          eigenvalue variance.percent cumulative.variance.percent
## Dim.1  10.373974176      79.79980135                    79.79980
## Dim.2   1.062223239       8.17094799                    87.97075
## Dim.3   0.773708414       5.95160318                    93.92235
## Dim.4   0.351324444       2.70249572                    96.62485
## Dim.5   0.213010307       1.63854082                    98.26339
## Dim.6   0.187759475       1.44430365                    99.70769
## Dim.7   0.015709441       0.12084185                    99.82853
## Dim.8   0.006306282       0.04850986                    99.87704
## Dim.9   0.005471419       0.04208784                    99.91913
## Dim.10  0.004654173       0.03580133                    99.95493
## Dim.11  0.002407607       0.01852006                    99.97345
## Dim.12  0.001785825       0.01373712                    99.98719
## Dim.13  0.001665200       0.01280923                   100.00000
fviz_screeplot(res.pca, addlabels = TRUE, ylim = c(0, 50))

var <- get_pca_var(res.pca)
var
## Principal Component Analysis Results for variables
##  ===================================================
##   Name       Description                                    
## 1 "$coord"   "Coordinates for the variables"                
## 2 "$cor"     "Correlations between variables and dimensions"
## 3 "$cos2"    "Cos2 for the variables"                       
## 4 "$contrib" "contributions of the variables"
fviz_pca_var(res.pca, col.var="contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE # Avoid text overlapping
             )

# Contributions of variables to PC1
fviz_contrib(res.pca, choice = "var", axes = 1, top = 10)

# Contributions of variables to PC2
fviz_contrib(res.pca, choice = "var", axes = 2, top = 10)

fviz_pca_ind(res.pca,
             label = "none", # hide individual labels
             habillage = as_factor(df$Sex), # color by groups
             palette = c("#00AFBB", "#E7B800", "#FC4E07"),
             addEllipses = TRUE, # Concentration ellipses
             alpha.ind = 0.3
             )

df_mca <- df %>%
  mutate(across(ends_with("?"), ~factor(.x, labels = c("Not lifted", "Lifted")))) %>%
  select(Sex, Equipment, Tested, Federation, ends_with("?")) %>%
  mutate(across(1:5, ~factor(.x))) 
#%>% sample_n(1000)

#result <- Factoshiny(df_mca)

res.MCA <- MCA(df_mca,graph=FALSE)
plot.MCA(res.MCA, choix='var')

plot.MCA(res.MCA,invisible= 'ind',selectMod= 'cos2 0.2',label =c('var'), graph.type="ggplot") + 
  geom_text_repel() + ggtitle("Mca factor map with cos2 >= 0.2")

#Values 
summary(res.MCA)
## 
## Call:
## MCA(X = df_mca, graph = FALSE) 
## 
## 
## Eigenvalues
##                        Dim.1   Dim.2   Dim.3   Dim.4   Dim.5   Dim.6   Dim.7
## Variance               0.165   0.133   0.099   0.087   0.083   0.081   0.081
## % of var.              4.049   3.259   2.433   2.139   2.046   1.998   1.979
## Cumulative % of var.   4.049   7.309   9.742  11.881  13.928  15.925  17.904
##                        Dim.8   Dim.9  Dim.10  Dim.11  Dim.12  Dim.13  Dim.14
## Variance               0.079   0.077   0.077   0.077   0.077   0.077   0.077
## % of var.              1.931   1.897   1.894   1.890   1.889   1.888   1.887
## Cumulative % of var.  19.835  21.733  23.626  25.516  27.405  29.293  31.180
##                       Dim.15  Dim.16  Dim.17  Dim.18  Dim.19  Dim.20  Dim.21
## Variance               0.077   0.077   0.077   0.077   0.077   0.077   0.077
## % of var.              1.887   1.887   1.887   1.887   1.887   1.887   1.887
## Cumulative % of var.  33.066  34.953  36.840  38.727  40.614  42.500  44.387
##                       Dim.22  Dim.23  Dim.24  Dim.25  Dim.26  Dim.27  Dim.28
## Variance               0.077   0.077   0.077   0.077   0.077   0.077   0.077
## % of var.              1.887   1.887   1.887   1.887   1.887   1.887   1.887
## Cumulative % of var.  46.274  48.161  50.048  51.934  53.821  55.708  57.595
##                       Dim.29  Dim.30  Dim.31  Dim.32  Dim.33  Dim.34  Dim.35
## Variance               0.077   0.077   0.077   0.077   0.077   0.077   0.077
## % of var.              1.887   1.887   1.887   1.887   1.887   1.887   1.887
## Cumulative % of var.  59.482  61.368  63.255  65.142  67.029  68.915  70.802
##                       Dim.36  Dim.37  Dim.38  Dim.39  Dim.40  Dim.41  Dim.42
## Variance               0.077   0.077   0.077   0.077   0.077   0.076   0.074
## % of var.              1.887   1.887   1.887   1.887   1.887   1.862   1.806
## Cumulative % of var.  72.689  74.576  76.463  78.349  80.236  82.098  83.904
##                       Dim.43  Dim.44  Dim.45  Dim.46  Dim.47  Dim.48  Dim.49
## Variance               0.073   0.073   0.072   0.072   0.070   0.068   0.066
## % of var.              1.790   1.786   1.775   1.767   1.711   1.665   1.621
## Cumulative % of var.  85.694  87.480  89.255  91.021  92.732  94.397  96.018
##                       Dim.50  Dim.51  Dim.52  Dim.53
## Variance               0.066   0.060   0.032   0.005
## % of var.              1.609   1.472   0.790   0.111
## Cumulative % of var.  97.627  99.098  99.889 100.000
## 
## Individuals (the 10 first)
##               Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3    ctr   cos2
## 1          |  0.102  0.000  0.005 |  0.067  0.000  0.002 |  0.105  0.000  0.005
## 2          |  0.135  0.000  0.007 |  0.107  0.000  0.005 |  0.143  0.000  0.008
## 3          | -0.129  0.000  0.008 | -0.163  0.000  0.013 |  0.401  0.000  0.075
## 4          |  0.098  0.000  0.004 |  0.090  0.000  0.003 | -0.188  0.000  0.014
## 5          | -0.129  0.000  0.008 | -0.163  0.000  0.013 |  0.401  0.000  0.075
## 6          | -0.129  0.000  0.008 | -0.163  0.000  0.013 |  0.401  0.000  0.075
## 7          |  0.164  0.000  0.012 |  0.156  0.000  0.011 | -0.076  0.000  0.003
## 8          |  0.267  0.000  0.030 |  0.254  0.000  0.027 | -0.245  0.000  0.025
## 9          |  0.078  0.000  0.003 |  0.038  0.000  0.001 |  0.103  0.000  0.005
## 10         |  0.013  0.000  0.000 | -0.032  0.000  0.000 |  0.302  0.000  0.043
##             
## 1          |
## 2          |
## 3          |
## 4          |
## 5          |
## 6          |
## 7          |
## 8          |
## 9          |
## 10         |
## 
## Categories (the 10 first)
##                 Dim.1      ctr     cos2   v.test      Dim.2      ctr     cos2
## F          |   -0.140    0.301    0.010  -59.557 |   -0.105    0.209    0.005
## M          |    0.069    0.148    0.010   59.557 |    0.052    0.103    0.005
## Raw        |   -0.408    5.188    0.336 -351.208 |   -0.304    3.563    0.185
## Single-ply |    0.819   10.356    0.332  349.537 |    0.617    7.298    0.189
## Wraps      |    2.194    0.146    0.003   33.999 |   -2.381    0.214    0.004
## No         |    6.728   26.829    0.583  463.040 |   -5.449   21.861    0.383
## Yes        |   -0.087    0.346    0.583 -463.040 |    0.070    0.282    0.383
## AEP        |   -0.407    0.066    0.001  -22.861 |   -0.294    0.042    0.001
## APU        |   -0.674    0.216    0.005  -41.500 |   -0.723    0.309    0.005
## AsianPF    |    0.567    0.113    0.002   30.037 |    0.887    0.346    0.006
##              v.test      Dim.3      ctr     cos2   v.test  
## F           -44.521 |   -0.192    0.940    0.018  -81.558 |
## M            44.521 |    0.094    0.463    0.018   81.558 |
## Raw        -261.122 |   -0.326    5.498    0.214 -280.270 |
## Single-ply  263.255 |    0.657   11.071    0.213  280.152 |
## Wraps       -36.905 |    0.305    0.005    0.000    4.732 |
## No         -375.012 |   -0.042    0.002    0.000   -2.921 |
## Yes         375.012 |    0.001    0.000    0.000    2.921 |
## AEP         -16.498 |   -1.109    0.812    0.011  -62.319 |
## APU         -44.512 |   -0.024    0.000    0.000   -1.465 |
## AsianPF      47.034 |    1.794    1.893    0.025   95.104 |
## 
## Categorical variables (eta2)
##                   Dim.1 Dim.2 Dim.3  
## Sex             | 0.010 0.005 0.018 |
## Equipment       | 0.337 0.191 0.214 |
## Tested          | 0.583 0.383 0.000 |
## Federation      | 0.759 0.667 0.400 |
## LiftedSquat1Kg? | 0.029 0.040 0.015 |
## LiftedSquat2Kg? | 0.081 0.068 0.045 |
## LiftedSquat3Kg? | 0.068 0.051 0.113 |
## LiftedBench1Kg? | 0.019 0.053 0.000 |
## LiftedBench2Kg? | 0.036 0.053 0.092 |
## LiftedBench3Kg? | 0.010 0.008 0.243 |
#Description of the axis

#eigenvalues > 1/n_categories

#measures the degree of association between variable categories and a particular axis
fviz_mca_var(res.MCA, alpha.var = "cos2", select.var = list(cos2 = 0.15),
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), 
             repel = TRUE, # Avoid text overlapping
             ggtheme = theme_minimal())

#visualize the inertia kept by dimensions

fviz_screeplot(res.MCA, addlabels = TRUE, ylim = c(0, 45))

#quality of representation  
fviz_cos2(res.MCA, choice = "var", axes = 1:2)

df_mfa <- df %>%
  mutate(across(ends_with("?"), ~factor(.x, labels = c("Not lifted", "Lifted")))) %>%
  select(Sex, Equipment, Tested, Federation, ends_with("?"), where(is.numeric)) %>%
  mutate(across(1:5, ~factor(.x))) %>%
  mutate_if(is.numeric, abs)

#We have to decide groups

# Lifter associated health and condition
# Weight tried
# Quality of the lifts

df_mfa <- df_mfa %>%
  select(Age, BodyweightKg,
         contains("WeightTried"),
         ends_with("?"))

res.mfa <- MFA(df_mfa, 
               group = c(2, 9, 9), 
               type = c("s", "s", "n"),
               name.group = c("Lifter condition", "Weight tried", "Quality of the lift"),
               graph = FALSE)

head(get_eigenvalue(res.mfa))
##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1  1.5975177        18.317722                    18.31772
## Dim.2  1.0184891        11.678368                    29.99609
## Dim.3  0.9031305        10.355624                    40.35171
## Dim.4  0.6876887         7.885290                    48.23700
## Dim.5  0.6424357         7.366403                    55.60341
## Dim.6  0.6316142         7.242319                    62.84573
fviz_screeplot(res.mfa)

group_mfa <- get_mfa_var(res.mfa, "group")
fviz_mfa_var(res.mfa, "group")

fviz_contrib(res.mfa, "group", axes = 1)

fviz_contrib(res.mfa, "group", axes = 2)

get_mfa_var(res.mfa, "quanti.var")
## Multiple Factor Analysis results for quantitative variables 
##  ===================================================
##   Name       Description                      
## 1 "$coord"   "Coordinates"                    
## 2 "$cos2"    "Cos2, quality of representation"
## 3 "$contrib" "Contributions"
fviz_mfa_var(res.mfa, "quanti.var", palette = "jco", 
             col.var.sup = "violet", repel = TRUE,
             geom = c("point", "text"), legend = "bottom")

fviz_contrib(res.mfa, choice = "quanti.var", axes = 1, top = 20,
             palette = "jco")

fviz_contrib(res.mfa, choice = "quanti.var", axes = 2, top = 20,
             palette = "jco")

fviz_mfa_var(res.mfa, "quanti.var", col.var = "contrib", 
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), 
             col.var.sup = "violet", repel = TRUE,
             geom = c("point", "text"))

fviz_mfa_var(res.mfa, col.var = "cos2",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), 
             col.var.sup = "violet", repel = TRUE, geom = c("point", "text"))

fviz_cos2(res.mfa, choice = "quanti.var", axes = 1)

#The graph of partial axes shows the relationship between the principal axes 
#of the MFA and the ones obtained from analyzing each group using either a PCA 
#(for groups of continuous variables) or a MCA (for qualitative variables).

fviz_mfa_axes(res.mfa, geom = c("point", "text"), graph.type="ggplot", repel = T)

######Association Rules######

itemFrequencyPlot(transactions(df_mca), topN = 20,
                  col = brewer.pal(8, 'BuPu'),
                  main = 'Relative Item Frequency Plot',
                  type = "relative",
                  ylab = "Item Frequency (Relative)")

#https://www.datarlabs.com/post/market-basket-optimisation-using-association-rule-mining

df_eclat <- transactions(df_mca)

rules_eclat = eclat(data = df_eclat, 
              parameter = list(support = 0.6, minlen = 2))
## Eclat
## 
## parameter specification:
##  tidLists support minlen maxlen            target  ext
##     FALSE     0.6      2     10 frequent itemsets TRUE
## 
## algorithmic control:
##  sparse sort verbose
##       7   -2    TRUE
## 
## Absolute minimum support count: 220587 
## 
## create itemset ... 
## set transactions ...[66 item(s), 367645 transaction(s)] done [0.24s].
## sorting and recoding items ... [10 item(s)] done [0.02s].
## creating bit matrix ... [10 row(s), 367645 column(s)] done [0.02s].
## writing  ... [88 set(s)] done [0.00s].
## Creating S4 object  ... done [0.00s].
inspect(sort(rules_eclat, by = 'support')[1:10])
##      items                          support  count
## [1]  {Tested=Yes,                                 
##       LiftedDeadlift1Kg?=Lifted}  0.9401814 345653
## [2]  {Tested=Yes,                                 
##       LiftedBench1Kg?=Lifted}     0.8896653 327081
## [3]  {LiftedBench1Kg?=Lifted,                     
##       LiftedDeadlift1Kg?=Lifted}  0.8604551 316342
## [4]  {Tested=Yes,                                 
##       LiftedSquat1Kg?=Lifted}     0.8566226 314933
## [5]  {Tested=Yes,                                 
##       LiftedDeadlift2Kg?=Lifted}  0.8558637 314654
## [6]  {Tested=Yes,                                 
##       LiftedBench1Kg?=Lifted,                     
##       LiftedDeadlift1Kg?=Lifted}  0.8495097 312318
## [7]  {LiftedDeadlift1Kg?=Lifted,                  
##       LiftedDeadlift2Kg?=Lifted}  0.8291803 304844
## [8]  {LiftedSquat1Kg?=Lifted,                     
##       LiftedDeadlift1Kg?=Lifted}  0.8288430 304720
## [9]  {Tested=Yes,                                 
##       LiftedDeadlift1Kg?=Lifted,                  
##       LiftedDeadlift2Kg?=Lifted}  0.8203539 301599
## [10] {Tested=Yes,                                 
##       LiftedSquat1Kg?=Lifted,                     
##       LiftedDeadlift1Kg?=Lifted}  0.8190619 301124
#Tested -> may imply better tournament so better lifters
df_apriori <- transactions(df_mca)

#colors just because I can


rules_apriori <- apriori(df_apriori, parameter = list(supp = 0.1, conf = 0.8))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5     0.1      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 36764 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[66 item(s), 367645 transaction(s)] done [0.21s].
## sorting and recoding items ... [22 item(s)] done [0.03s].
## creating transaction tree ... done [0.34s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(df_apriori, parameter = list(supp = 0.1, conf = 0.8)): Mining
## stopped (maxlen reached). Only patterns up to a length of 10 returned!
##  done [0.04s].
## writing ... [33968 rule(s)] done [0.59s].
## creating S4 object  ... done [0.07s].
inspect(rules_apriori[1:20])
##      lhs                                rhs                           support confidence  coverage      lift  count
## [1]  {}                              => {LiftedSquat2Kg?=Lifted}    0.8027771  0.8027771 1.0000000 1.0000000 295137
## [2]  {}                              => {LiftedDeadlift2Kg?=Lifted} 0.8652341  0.8652341 1.0000000 1.0000000 318099
## [3]  {}                              => {LiftedSquat1Kg?=Lifted}    0.8670484  0.8670484 1.0000000 1.0000000 318766
## [4]  {}                              => {LiftedBench1Kg?=Lifted}    0.9013641  0.9013641 1.0000000 1.0000000 331382
## [5]  {}                              => {LiftedDeadlift1Kg?=Lifted} 0.9520761  0.9520761 1.0000000 1.0000000 350026
## [6]  {}                              => {Tested=Yes}                0.9872785  0.9872785 1.0000000 1.0000000 362968
## [7]  {LiftedSquat1Kg?=Not lifted}    => {LiftedDeadlift2Kg?=Lifted} 0.1093773  0.8226846 0.1329516 0.9508231  40212
## [8]  {LiftedSquat1Kg?=Not lifted}    => {LiftedBench1Kg?=Lifted}    0.1130438  0.8502629 0.1329516 0.9433068  41560
## [9]  {LiftedSquat1Kg?=Not lifted}    => {LiftedDeadlift1Kg?=Lifted} 0.1232330  0.9269011 0.1329516 0.9735579  45306
## [10] {LiftedSquat1Kg?=Not lifted}    => {Tested=Yes}                0.1306559  0.9827329 0.1329516 0.9953958  48035
## [11] {LiftedDeadlift2Kg?=Not lifted} => {LiftedSquat1Kg?=Lifted}    0.1111915  0.8250717 0.1347659 0.9515866  40879
## [12] {LiftedDeadlift2Kg?=Not lifted} => {LiftedBench1Kg?=Lifted}    0.1165146  0.8645703 0.1347659 0.9591799  42836
## [13] {LiftedDeadlift2Kg?=Not lifted} => {LiftedDeadlift1Kg?=Lifted} 0.1228957  0.9119202 0.1347659 0.9578229  45182
## [14] {LiftedDeadlift2Kg?=Not lifted} => {Tested=Yes}                0.1314148  0.9751342 0.1347659 0.9876992  48314
## [15] {LiftedSquat2Kg?=Not lifted}    => {LiftedDeadlift2Kg?=Lifted} 0.1589033  0.8057042 0.1972229 0.9311979  58420
## [16] {LiftedSquat2Kg?=Not lifted}    => {LiftedSquat1Kg?=Lifted}    0.1599641  0.8110829 0.1972229 0.9354529  58810
## [17] {LiftedSquat2Kg?=Not lifted}    => {LiftedBench1Kg?=Lifted}    0.1701070  0.8625117 0.1972229 0.9568960  62539
## [18] {LiftedSquat2Kg?=Not lifted}    => {LiftedDeadlift1Kg?=Lifted} 0.1837261  0.9315662 0.1972229 0.9784577  67546
## [19] {LiftedSquat2Kg?=Not lifted}    => {Tested=Yes}                0.1926206  0.9766646 0.1972229 0.9892494  70816
## [20] {LiftedBench2Kg?=Not lifted}    => {LiftedDeadlift2Kg?=Lifted} 0.1789797  0.8200523 0.2182540 0.9477809  65801
#Quick check
df_mca %>% select(contains("1")) %>% group_by(`LiftedSquat1Kg?`) %>% summarize(n = n())
## # A tibble: 2 × 2
##   `LiftedSquat1Kg?`      n
##   <fct>              <int>
## 1 Not lifted         48879
## 2 Lifted            318766
df_mca %>% select(contains("1")) %>% group_by(`LiftedBench1Kg?`) %>% summarize(n = n())
## # A tibble: 2 × 2
##   `LiftedBench1Kg?`      n
##   <fct>              <int>
## 1 Not lifted         36263
## 2 Lifted            331382
df_mca %>% select(contains("1")) %>% group_by(`LiftedDeadlift1Kg?`) %>% summarize(n = n())
## # A tibble: 2 × 2
##   `LiftedDeadlift1Kg?`      n
##   <fct>                 <int>
## 1 Not lifted            17619
## 2 Lifted               350026
df_mca %>% select(contains("2")) %>% group_by(`LiftedSquat2Kg?`) %>% summarize(n = n())
## # A tibble: 2 × 2
##   `LiftedSquat2Kg?`      n
##   <fct>              <int>
## 1 Not lifted         72508
## 2 Lifted            295137
df_mca %>% select(contains("2")) %>% group_by(`LiftedBench2Kg?`) %>% summarize(n = n())
## # A tibble: 2 × 2
##   `LiftedBench2Kg?`      n
##   <fct>              <int>
## 1 Not lifted         80240
## 2 Lifted            287405
df_mca %>% select(contains("2")) %>% group_by(`LiftedDeadlift2Kg?`) %>% summarize(n = n())
## # A tibble: 2 × 2
##   `LiftedDeadlift2Kg?`      n
##   <fct>                 <int>
## 1 Not lifted            49546
## 2 Lifted               318099
df_mca %>% select(contains("3")) %>% group_by(`LiftedSquat3Kg?`) %>% summarize(n = n())
## # A tibble: 2 × 2
##   `LiftedSquat3Kg?`      n
##   <fct>              <int>
## 1 Not lifted        137288
## 2 Lifted            230357
df_mca %>% select(contains("3")) %>% group_by(`LiftedBench3Kg?`) %>% summarize(n = n())
## # A tibble: 2 × 2
##   `LiftedBench3Kg?`      n
##   <fct>              <int>
## 1 Not lifted        195228
## 2 Lifted            172417
df_mca %>% select(contains("3")) %>% group_by(`LiftedDeadlift3Kg?`) %>% summarize(n = n())
## # A tibble: 2 × 2
##   `LiftedDeadlift3Kg?`      n
##   <fct>                 <int>
## 1 Not lifted           155407
## 2 Lifted               212238
df %>%
  ggplot(aes(x = TotalKg, fill = Sex)) + geom_boxplot() + 
  facet_wrap(~Sex) + scale_fill_viridis_d() 

df %>%
  filter(!is.na(Country)) %>%
  ggplot(aes(x = TotalKg, fill = Country)) + 
  geom_boxplot(show.legend = F) + 
  facet_wrap(~Country) +
  scale_fill_viridis_d() 

df %>%
  group_by(Date) %>%
  summarize(m = median(TotalKg)) %>%
  ggplot(aes(x = Date, y = m)) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

df %>%
  group_by(Tested) %>%
  summarize(n = n()) %>%
  ggplot(aes(x = Tested, y = n)) + geom_col()

round(cor(numeric_cols, use = "complete.obs"), 1) %>%
  ggcorrplot(ggtheme = ggplot2::theme_gray,
             colors = c("#6D9EC1", "white", "#E46726"),
             lab = T, type = "lower")

##World map over total df
world <- map_data("world")


world %>%
  merge(df %>%
          group_by(Country) %>%
          summarize(n = mean(TotalKg, na.rm = T)) %>%
          mutate(country = Country), 
        by.x = "region", by.y = "country", all.x = T) %>%
  arrange(group, order) %>%
  ggplot(aes(x = long, y = lat, group = group, fill = n)) + 
  geom_polygon(color = "white", size = 0.2) +
  scale_fill_viridis(option="mako", na.value = "gray90") +
  theme_minimal() +
  theme(axis.text = element_blank(),
        axis.title = element_blank(),
        panel.grid = element_blank()) + 
  ggtitle("Mean KG lifted by country")

world %>%
  merge(df %>%
          group_by(Country) %>%
          summarize(n = mean(Age, na.rm = T)) %>%
          mutate(country = Country), 
        by.x = "region", by.y = "country", all.x = T) %>%
  arrange(group, order) %>%
  ggplot(aes(x = long, y = lat, group = group, fill = n)) + 
  geom_polygon(color = "white", size = 0.2) +
  scale_fill_viridis(option="mako", na.value = "gray90") +
  theme_minimal() +
  theme(axis.text = element_blank(),
        axis.title = element_blank(),
        panel.grid = element_blank()) + 
  ggtitle("Mean Age by country")